library(Seurat)
library(Matrix)
library(dplyr)
library(tidyr)
library(ggplot2)
library(tibble)
library(cowplot)
library(patchwork)
library(stringr)
library(pheatmap)
library(reshape2)
library(readxl)
library(stringr)
library(DESeq2)
library(gridExtra)
library(ggrepel)
library(ggpubr)
library(tidyverse)
library(viridisLite)
theme_set(theme_classic())
data_dir = "/home/zli4/MSK_DEC24/Analysis"
out_dir = "/fh/fast/sun_w/zhaoheng_li/MSK_DEC24/Analysis/outs/anova"integrated = readRDS(file.path(data_dir,"integrated-rpca_BDF-ref_withMeta.RDS"))# remove cells without cell type annotations
keep.cells= rownames(integrated@meta.data)[
!is.na(integrated@meta.data$celltype)
]
integrated= subset(
integrated,
cells = keep.cells
)
integrated$celltype = as.factor(integrated$celltype)
summary(integrated$celltype)## DE Ductal Endothelial EnP
## 16283 18880 1114 19845
## ESC ESC_DE Liver Mesenchymal_Stromal
## 15280 4559 9363 1698
## PFG PGT PP SC-alpha
## 8440 2627 14249 9684
## SC-beta SC-delta SC-EC Stromal
## 10152 1856 13743 3944
celltype_orders = c(
"ESC",
"ESC_DE",
"DE",
"PFG",
"PGT",
"PP",
"EnP",
"SC-EC",
"SC-alpha",
"SC-beta",
"SC-delta",
"Ductal",
"Mesenchymal_Stromal",
"Stromal",
"Endothelial",
"Liver"
)
integrated$celltype = factor(integrated$celltype,
levels =celltype_orders ,
ordered = TRUE)
celltype_palette = readRDS("/fh/fast/sun_w/zhaoheng_li/MSK_DEC24/Analysis/outs/celltype_palette.RDS")obj = integrated%>%subset(subset = time_point%in%c("D-1","D3","D7","D11","D18"))
obj$time_point = droplevels(obj$time_point)
summary(obj$time_point)## D-1 D3 D7 D11 D18
## 15775 18602 16943 12052 36357
obj_list = SplitObject(obj, split.by = "time_point")
obj_list## $D18
## An object of class Seurat
## 36390 features across 36357 samples within 1 assay
## Active assay: RNA (36390 features, 2000 variable features)
## 3 layers present: data, counts, scale.data
## 4 dimensional reductions calculated: pca, umap, integrated.rpca, rpca.umap
##
## $D7
## An object of class Seurat
## 36390 features across 16943 samples within 1 assay
## Active assay: RNA (36390 features, 2000 variable features)
## 3 layers present: data, counts, scale.data
## 4 dimensional reductions calculated: pca, umap, integrated.rpca, rpca.umap
##
## $D3
## An object of class Seurat
## 36390 features across 18602 samples within 1 assay
## Active assay: RNA (36390 features, 2000 variable features)
## 3 layers present: data, counts, scale.data
## 4 dimensional reductions calculated: pca, umap, integrated.rpca, rpca.umap
##
## $D11
## An object of class Seurat
## 36390 features across 12052 samples within 1 assay
## Active assay: RNA (36390 features, 2000 variable features)
## 3 layers present: data, counts, scale.data
## 4 dimensional reductions calculated: pca, umap, integrated.rpca, rpca.umap
##
## $`D-1`
## An object of class Seurat
## 36390 features across 15775 samples within 1 assay
## Active assay: RNA (36390 features, 2000 variable features)
## 3 layers present: data, counts, scale.data
## 4 dimensional reductions calculated: pca, umap, integrated.rpca, rpca.umap
f_pseudobulk_list = file.path(out_dir,"pseudobulk_list.RDS")
if(file.exists(f_pseudobulk_list)){
pseudo_obj_list = readRDS(f_pseudobulk_list)
}else{
pseudo_obj_list = list()
for(tp in c("D-1","D3","D7","D11", "D18")){
obj = obj_list[[tp]]
obj$celltype = as.factor(obj$celltype)
obj$background = as.factor(obj$background)
obj$genotype = as.factor(obj$genotype)
levels(obj$celltype)=gsub("_","-",levels(obj$celltype)) # rename celltype levels
pseudo_obj = AggregateExpression(obj, assays = "RNA", return.seurat = T,
group.by = c("sample_name","feature.4", "celltype"))
pseudo_obj$sample_name = gsub("-","_",pseudo_obj$sample_name)
# process metadata
pseudo_meta = pseudo_obj[[]]
pseudo_meta$pseudo.sample.id = pseudo_meta$orig.ident
obj$pseudo.sample.id = as.factor(paste(obj$sample_name,obj$feature.4,obj$celltype,sep='_'))
obj$pseudo.sample.id = gsub("-","_", obj$pseudo.sample.id)
pseudo_meta$pseudo.sample.id = gsub("-","_",rownames(pseudo_meta))
pseudo_meta = pseudo_meta %>%
left_join(obj[[]]%>%select(pseudo.sample.id,genotype,source,background)%>%distinct(),
by = c("pseudo.sample.id"))
pseudo_meta$celltype = as.factor(pseudo_meta$celltype)
pseudo_meta$genotype = as.factor(pseudo_meta$genotype)
n_df=obj[[]]%>%dplyr::count(pseudo.sample.id)
pseudo_meta = pseudo_meta%>%left_join(n_df)
rownames(pseudo_meta) = pseudo_meta$orig.ident
pseudo_obj@meta.data = pseudo_meta
pseudo_obj$celltype = as.character( pseudo_obj$celltype)
pseudo_obj$celltype = ifelse(pseudo_obj$celltype == 'ESC-DE','ESC_DE',
ifelse(pseudo_obj$celltype == 'Mesenchymal-Stromal','Mesenchymal_Stromal',pseudo_obj$celltype ))
pseudo_obj$celltype = as.factor(pseudo_obj$celltype)
pseudo_obj$genotype = as.factor(pseudo_obj$genotype)
pseudo_obj_list[[tp]] = pseudo_obj
}
names(pseudo_obj_list) = c("D-1","D3","D7","D11", "D18")
saveRDS(pseudo_obj_list,f_pseudobulk_list)
}pseudo_obj_list = lapply(pseudo_obj_list,function(pseudo_obj){
pseudo_obj$celltype = factor(pseudo_obj$celltype,
levels =celltype_orders ,
ordered = TRUE)
pseudo_obj
})min_cells = 30
pseudo_obj_list = lapply(pseudo_obj_list,function(pseudo_obj){
pseudo_obj%>%subset(subset = n>=min_cells)
})
pseudo_obj_list## $`D-1`
## An object of class Seurat
## 36390 features across 93 samples within 1 assay
## Active assay: RNA (36390 features, 0 variable features)
## 3 layers present: counts, data, scale.data
##
## $D3
## An object of class Seurat
## 36390 features across 129 samples within 1 assay
## Active assay: RNA (36390 features, 0 variable features)
## 3 layers present: counts, data, scale.data
##
## $D7
## An object of class Seurat
## 36390 features across 136 samples within 1 assay
## Active assay: RNA (36390 features, 0 variable features)
## 3 layers present: counts, data, scale.data
##
## $D11
## An object of class Seurat
## 36390 features across 92 samples within 1 assay
## Active assay: RNA (36390 features, 0 variable features)
## 3 layers present: counts, data, scale.data
##
## $D18
## An object of class Seurat
## 36390 features across 204 samples within 1 assay
## Active assay: RNA (36390 features, 0 variable features)
## 3 layers present: counts, data, scale.data
for each time point, use top 5000 highly variable genes (HVGs)
pca_features = list()
for(tp in c("D-1","D3","D7","D11","D18")){
pseudo_obj = pseudo_obj_list[[tp]]
pseudo_obj = FindVariableFeatures(pseudo_obj,nfeatures = 5000)
pca_features[[tp]] = VariableFeatures(pseudo_obj)
pseudo_obj = ScaleData(pseudo_obj,verbose = F)
pseudo_obj=RunPCA(pseudo_obj,npcs = min(50,ncol(pseudo_obj)-1),verbose = F)
pve <- (pseudo_obj[["pca"]]@stdev)^2 # eigen-values (= variance of each PC)
pve <- pve / sum(pve) # proportion of total variance
pc_labs = function(i) sprintf("PC%d (%.1f%%)", i, 100 * pve[i])
df = as.data.frame(cbind(Embeddings(pseudo_obj, reduction = "pca")[,1:2],pseudo_obj[[]]))
df$celltype = factor(df$celltype,
levels =celltype_orders ,
ordered = TRUE)
df$celltype = droplevels(df$celltype)
# set shapes
shape_values <- c(16, 17, 15, 3, 7, 8, 18, 0, 1, 2)
shape_values = shape_values[1:length(unique(df$celltype))]
names(shape_values) <- unique(df$celltype)
fig1 <- ggplot(df, aes(x = PC_1,
y = PC_2,
color = celltype,
shape = celltype)) +
geom_point(size = 1,
alpha = 0.8) + # 80% opacity
scale_shape_manual(values = shape_values) + # assign each celltype its own shape
labs(
title = "",
x = pc_labs(1),
y = pc_labs(2),
color = "cell type",
shape = "cell type"
) +
scale_color_manual(
values = celltype_palette,
drop = TRUE
)+
theme_classic()+
guides(
color = guide_legend(
override.aes = list(size = 4) # Increase legend point size
)
)
fig2 = df%>%ggplot(aes(x=PC_1,y=PC_2,color = n))+
geom_point(size=1)+
viridis::scale_color_viridis()+
labs(
title = "",
x = pc_labs(1),
y = pc_labs(2),
color = "# cells"
)+
theme_classic()
fig3 = df%>%ggplot(aes(x=PC_1,y=PC_2,color = background))+
geom_point(size=1)+
labs(
title = "",
x = pc_labs(1),
y = pc_labs(2),
)+
theme_classic()+
guides(
color = guide_legend(
override.aes = list(size = 4) # Increase legend point size
)
)
fig4 = df%>%ggplot(aes(x=PC_1,y=PC_2,color = source))+
geom_point(size=1)+
labs(
title = "",
x = pc_labs(1),
y = pc_labs(2),
)+
theme_classic()+
guides(
color = guide_legend(
override.aes = list(size = 4) # Increase legend point size
)
)
fig5 = df%>%ggplot(aes(x=PC_1,y=PC_2,color = genotype,shape=background,label=feature.4))+
geom_point(size=1)+
ggrepel::geom_text_repel(aes(label = feature.4),size =2) +
labs(
title = "",
x = pc_labs(1),
y = pc_labs(2),
)+
theme_classic()+
theme(legend.position = "none")
fig = gridExtra::grid.arrange(
fig1, fig2, fig3, fig4, fig5,
layout_matrix = rbind(
c(1, 2), # row-1
c(3, 4), # row-2
c(5, 5) # row-3 ← fig5 spans both columns
),
heights = c(1, 1, 2) # give fig5 double the row-height
)
ggsave(file.path(out_dir,paste0("pc_plots/",tp,".png",sep='')),fig,width = 10,height = 12)
}if(file.exists( file.path(out_dir,"pca_features.RDS"))){
pca_features = readRDS(file.path(out_dir,"pca_features.RDS"))
}else{
saveRDS(pca_features,file.path(out_dir,"pca_features.RDS"))
}for(tp in c("D-1","D3","D7","D11","D18")){
obj = pseudo_obj_list[[tp]]
# filter genotypes
obj$genotype = as.factor(obj$genotype)
tmp = obj[[]]%>%select(genotype,feature.4)%>%distinct()
obj = obj%>%subset(subset = genotype%in%names(which(summary(tmp$genotype)>=2)))
obj$genotype = droplevels(obj$genotype)
pseudo_obj_list[[tp]] = obj
}for(tp in c("D-1","D3","D7","D11","D18")){
if(!file.exists(file.path(out_dir,paste0(tp,"_pseudo_lm_res.RDS",sep='')))){
features = pca_features[[tp]] # time-point specific HVGs
print(length(features))
print(tp)
obj = pseudo_obj_list[[tp]]
# filter genotypes
obj$genotype = as.factor(obj$genotype)
tmp = obj[[]]%>%select(genotype,feature.4)%>%distinct()
obj = obj%>%subset(subset = genotype%in%names(which(summary(tmp$genotype)>=2)))
obj$genotype = droplevels(obj$genotype)
meta = obj[[]]%>%dplyr::select(source,background,celltype,genotype,feature.4)
meta$celltype = as.factor(meta$celltype)
dat = GetAssayData(obj,layer='data')[features,]
counts = GetAssayData(obj,layer='counts')[features,]
rd=colSums(counts)
meta$rd = rd
meta$genotype = as.factor( meta$genotype) # include genotypes with only 1 clones
meta$sample = rownames(meta)
summary( meta$genotype)
meta$genotype = relevel( meta$genotype,ref="WT")
genotype_mtx =model.matrix(~ genotype, meta) # craete dummy variables
colnames(genotype_mtx)[1] = "genotype_ref"
genotype_mtx = as.data.frame(genotype_mtx)
genotype_mtx$sample = rownames(genotype_mtx)
meta = meta%>%left_join(genotype_mtx,by = join_by(sample))
if(tp=="D-1"){ # handle D-1 data
df= parallel::mclapply(features,function(g){
y_df = meta; y_df$y = dat[g,]
mod0 = lm(y~rd,data=y_df)
mod1 = lm(y~rd+source,data=y_df)
mod2 = lm(y~rd+source+background,data=y_df)
mod3 = mod2
r2 = c(summary(mod0)$r.squared,
summary(mod1)$r.squared-summary(mod0)$r.squared, # data source
summary(mod2)$r.squared-summary(mod1)$r.squared, # cell background
NA)
names(r2) = c("read-depth","data-source","cell-background","cell-type")
genotype_KO = setdiff(levels(meta$genotype),"WT")
genotype_r2 = rep(NA,length(genotype_KO));names(genotype_r2) = genotype_KO
genotype_pval = rep(NA,length(genotype_KO));names(genotype_pval) = paste("pval",genotype_KO,sep="_")
for(geno in genotype_KO){
# full model
df0 = y_df%>%dplyr::select(-feature.4,-genotype,-genotype_ref,-sample,-celltype)
mod_full = lm(y~.,data = df0)
v = as.symbol(paste0("genotype",geno))
# remove genotype of interest
df1 = y_df%>%dplyr::select(-feature.4,-genotype,-genotype_ref,-sample,-paste0("genotype",geno),-celltype)
mod4 = lm(y~.,data = df1)
genotype_r2[geno] = summary(mod_full)$r.squared-summary(mod4)$r.squared
genotype_pval[paste("pval",geno,sep="_")] = summary(mod_full)$coefficients[setdiff( names(mod_full$coefficients), names(mod4$coefficients)),"Pr(>|t|)"]
}
res = c(r2,genotype_r2,genotype_pval)
return(res)
},mc.cores = 20)
}else{
tictoc::tic()
df= parallel::mclapply(features,function(g){
y_df = meta; y_df$y = dat[g,]
mod0 = lm(y~rd,data=y_df)
mod1 = lm(y~rd+source,data=y_df)
mod2 = lm(y~rd+source+background,data=y_df)
mod3 = lm(y~rd+source+background+celltype,data=y_df)
r2 = c(summary(mod0)$r.squared,
summary(mod1)$r.squared-summary(mod0)$r.squared, # data source
summary(mod2)$r.squared-summary(mod1)$r.squared, # cell background
ifelse(summary(mod3)$r.squared-summary(mod2)$r.squared==0,NA,summary(mod3)$r.squared-summary(mod2)$r.squared))
names(r2) = c("read-depth","data-source","cell-background","cell-type")
genotype_KO = setdiff(levels(meta$genotype),"WT")
genotype_r2 = rep(NA,length(genotype_KO));names(genotype_r2) = genotype_KO
genotype_pval = rep(NA,length(genotype_KO));names(genotype_pval) = paste("pval",genotype_KO,sep="_")
for(geno in genotype_KO){
# full model
df0 = y_df%>%dplyr::select(-feature.4,-genotype,-genotype_ref,-sample)
mod_full = lm(y~.,data = df0)
v = as.symbol(paste0("genotype",geno))
# remove genotype of interest
df1 = y_df%>%dplyr::select(-feature.4,-genotype,-genotype_ref,-sample,-paste0("genotype",geno))
mod4 = lm(y~.,data = df1)
genotype_r2[geno] = summary(mod_full)$r.squared-summary(mod4)$r.squared
genotype_pval[paste("pval",geno,sep="_")] = summary(mod_full)$coefficients[setdiff( names(mod_full$coefficients), names(mod4$coefficients)),"Pr(>|t|)"]
}
res = c(r2,genotype_r2,genotype_pval)
return(res)
},mc.cores = 20)
tictoc::toc()
}
df=do.call("rbind", df)
rownames(df) = features
saveRDS(df,file.path(out_dir,paste0(tp,"_pseudo_lm_res.RDS",sep='')))
}
}only time-point specific HVGs with <75% percent zeros across cells are considered
#pdf(paste(out_dir,"htmap.pdf",sep=''))
for(tp in c("D-1","D3","D7","D11","D18")){
features_tp = pca_features[[tp]]
pseudo_obj = pseudo_obj_list[[tp]]
dat = GetAssayData(pseudo_obj)[features_tp,]
zero_prop = apply(dat,1,function(x){
sum(x==0)/length(x)
})
features_tp = features_tp[which(zero_prop<quantile(zero_prop,0.75))]
lm_res = readRDS(file.path(out_dir,paste0(tp,"_pseudo_lm_res.RDS",sep='')))
if(tp=="D-1"){
mtx=lm_res[,-c(4,which(startsWith(colnames(lm_res), "pval_")))]
}else{
mtx =lm_res[,-c(which(startsWith(colnames(lm_res), "pval_")))]
}
mtx = mtx[features_tp,] # keep on tp-specific genes
if(sum(is.na(rowSums(mtx)))!=0){
mtx=mtx[-which(is.na(rowSums(mtx))),]
}
pvals = lm_res[,which(startsWith(colnames(lm_res), "pval_"))]
pvals=pvals[rownames(mtx),]
genotypes = sub(".*_", "", colnames(pvals))
for(g in rownames(mtx)){ # keep only r2 corresponding to significant coefficients
for(genotype in genotypes){
p = p.adjust(pvals[g,paste("pval",genotype,sep="_")], method = "fdr")
mtx[g,genotype] = ifelse(p<0.05,mtx[g,genotype],0)
}
}
print(dim(mtx))
mtx = log10(mtx+10^{-12})
mtx0 = mtx
mtx0[which(mtx0<(-4))] = -4 # truncate
mtx0_sub = mtx0[,-(1:4)] # genotypes only
breaks = c(seq(min(mtx0),-1,length.out=30),seq(-1,max(mtx0),length.out=50)[-1])
breaks1 = c(seq(min(mtx0_sub),-1,length.out=30),seq(-1,max(mtx0_sub),length.out=50)[-1])
htmap = pheatmap::pheatmap(mtx0,main = paste(tp, ": log10(proportion var. explained)",sep=""),cluster_cols = F,scale = 'none',show_rownames=F,color=plasma(length(breaks) - 1),clustering_method = "ward.D")
print(htmap)
print(pheatmap::pheatmap(mtx0_sub,main = paste(tp, ": log10(proportion var. explained): genotypes",sep=""),scale = 'none',show_rownames=F,color=plasma(length(breaks1) - 1),clustering_method = "ward.D"))
}## [1] 3746 29
## [1] 3749 29
## [1] 3750 19
## [1] 3691 19
## [1] 3745 21
for(tp in c("D-1","D3","D7","D11","D18")){
pseudo_obj = pseudo_obj_list[[tp]]
pseudo_obj$celltype = droplevels(pseudo_obj$celltype)
print(summary(pseudo_obj$celltype))
}## ESC ESC_DE
## 71 12
## ESC ESC_DE DE
## 7 31 76
## DE PFG Stromal Endothelial Liver
## 3 46 6 4 50
## PFG PP EnP Stromal Endothelial Liver
## 4 48 14 6 3 1
## PP EnP SC-EC SC-alpha
## 3 25 25 24
## SC-beta SC-delta Ductal Mesenchymal_Stromal
## 16 8 43 1
## Stromal Liver
## 12 4
mtx_lst = list()
for(tp in c("D-1","D3","D7","D11","D18")){
features_tp = pca_features[[tp]]
pseudo_obj = pseudo_obj_list[[tp]]
dat = GetAssayData(pseudo_obj)[features_tp,]
zero_prop = apply(dat,1,function(x){
sum(x==0)/length(x)
})
features_tp = features_tp[which(zero_prop<quantile(zero_prop,0.75))]
lm_res = readRDS(file.path(out_dir,paste0(tp,"_pseudo_lm_res.RDS",sep='')))
if(tp=="D-1"){
mtx=lm_res[,-c(4,which(startsWith(colnames(lm_res), "pval_")))]
}else{
mtx =lm_res[,-c(which(startsWith(colnames(lm_res), "pval_")))]
}
mtx = mtx[features_tp,] # keep on tp-specific genes
if(sum(is.na(rowSums(mtx)))!=0){
mtx=mtx[-which(is.na(rowSums(mtx))),]
}
pvals = lm_res[,which(startsWith(colnames(lm_res), "pval_"))]
pvals=pvals[rownames(mtx),]
genotypes = sub(".*_", "", colnames(pvals))
print(dim(mtx))
mtx = log10(mtx+10^{-12})
# for each gene, adjust genotype p-valus by FDR
# keep only r2 corresponding to significant coefficients
for(g in rownames(mtx)){
for(genotype in genotypes){
p = p.adjust(pvals[g,paste("pval",genotype,sep="_")], method = "fdr")
mtx[g,genotype] = ifelse(p<0.05,mtx[g,genotype],-Inf)
}
}
df = as.data.frame(mtx)
df$time = tp
df$gene = rownames(df)
mtx_lst[[tp]] = df
}## [1] 3746 29
## [1] 3749 29
## [1] 3750 19
## [1] 3691 19
## [1] 3745 21
# enforce all matrices have the same columns
names = unique(unlist(sapply(mtx_lst,function(df){
colnames(df)
})))
mtx_lst = lapply(mtx_lst,function(df){
df[,setdiff(names,colnames(df))] = NA
return(df[,names])
})
df = do.call(rbind,mtx_lst)
df_long = reshape2::melt(df, id.vars = c("time","gene"), variable.name = "Factor", value.name = "Value")
df_long$time = factor(df_long$time,levels = c("D-1","D3","D7","D11","D18"))
df_long[1:2,]## time gene Factor Value
## 1 D-1 CHCHD2 read-depth -0.7030275
## 2 D-1 CHGA read-depth -0.2560885
for(tp in c("D3","D7","D11","D18")){
print(tp)
pseudo_obj = pseudo_obj_list[[tp]]
# remove cell types that only occur in one genotype
df= pseudo_obj[[]]
idx = (df%>%
group_by(celltype)%>%
summarise(n_distinct(genotype)))[,2]>=1
celltypes = (df%>%group_by(celltype)%>%summarise(n_distinct(genotype)))[,1][idx]
pseudo_obj = pseudo_obj%>%subset(subset = celltype%in%celltypes)
pseudo_obj$celltype = droplevels(pseudo_obj$celltype)
print(levels(pseudo_obj$celltype))
}## [1] "D3"
## [1] "ESC" "ESC_DE" "DE"
## [1] "D7"
## [1] "DE" "PFG" "Stromal" "Endothelial" "Liver"
## [1] "D11"
## [1] "PFG" "PP" "EnP" "Stromal" "Endothelial"
## [6] "Liver"
## [1] "D18"
## [1] "PP" "EnP" "SC-EC"
## [4] "SC-alpha" "SC-beta" "SC-delta"
## [7] "Ductal" "Mesenchymal_Stromal" "Stromal"
## [10] "Liver"
Proportion variance explained by cell type is adjusted by degree of freedom (number of cell types -1) in the panel “cell-type-scaled”.
df_long_scaled = df_long
df_long_scaled$Value <- ifelse(df_long$time =="D3" & df_long$Factor == "cell-type", df_long_scaled$Value-log10(2), df_long_scaled$Value)
df_long_scaled$Value <- ifelse(df_long$time =="D7" & df_long$Factor == "cell-type", df_long_scaled$Value-log10(4), df_long_scaled$Value)
df_long_scaled$Value <- ifelse(df_long$time =="D11" & df_long$Factor == "cell-type", df_long_scaled$Value-log10(5), df_long_scaled$Value)
df_long_scaled$Value <- ifelse(df_long$time =="D18" & df_long$Factor == "cell-type", df_long_scaled$Value-log10(9), df_long_scaled$Value)
df_long_scaled = df_long_scaled%>%filter(Factor=='cell-type')
df_long_scaled$Factor = 'cell-type-scaled'
unique(df_long$Factor)## [1] read-depth data-source cell-background BMPR1A
## [5] FOXA1 FOXA2 GATA4 GATA4het
## [9] GATA6 GATA6het GLIS3 GSC
## [13] HHEX HHEXe MNX1 NANOGehet
## [17] NEUROD1 NGN3 NKX2-2 ONECUT1e
## [21] OTUD5 PAX6 PBX1 PDX1
## [25] QSER1 QSER1TET1 RFX6 TET1/2/3
## [29] TLE3 cell-type
## 30 Levels: read-depth data-source cell-background BMPR1A FOXA1 FOXA2 ... cell-type
df_long = rbind(df_long,df_long_scaled)
fig1_scaled=df_long%>%filter(Factor%in%c("read-depth", "data-source",
"cell-background","cell-type","cell-type-scaled"))%>%
ggplot(aes(x = time,y =Value,color = time))+
geom_violin()+
geom_boxplot(width = 0.1, fill = "white", aes(color = time),outlier.size = 0.5) + # Boxplot
ylab("log10(proportion \n variance explained)")+
xlab("")+
theme_bw()+
scale_color_brewer(palette = "Set1")+
guides(color="none")+
facet_wrap(~Factor)
fig1_scaled## Warning: Removed 7492 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 7492 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
ggsave(
filename = file.path(out_dir,"propVar_summary.png"),
plot = fig1_scaled,
width = 6,
height = 3,
units = "in",
dpi = 300
)## Warning: Removed 7492 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Removed 7492 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
Same as above, except for truncating y-axis at -4
fig1_scaled_truncated = df_long%>%
mutate(Value = ifelse(df_long$Value<(-4),NA,df_long$Value))%>%
filter(Factor%in%c("read-depth","data-source","cell-background","cell-type","cell-type-scaled"))%>%
ggplot(aes(x = time,y =Value,color = time))+
geom_violin()+
geom_boxplot(width = 0.1, fill = "white", aes(color = time),outlier.size = 0.5) + # Boxplot
ylab("log10(proportion \n variance explained)")+
xlab("")+
theme_bw()+
scale_color_brewer(palette = "Set1")+
guides(color="none")+
facet_wrap(~Factor)
fig1_scaled_truncated## Warning: Removed 9692 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 9692 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
ggsave(
filename = file.path(out_dir,"propVar_summary_truncated.png"),
plot = fig1_scaled_truncated,
width = 6,
height = 3,
units = "in",
dpi = 300
)## Warning: Removed 9692 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Removed 9692 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
Only significant r-squares are kept for visualization.
Numbers on top indicates the number of genotypes incorporated in the analysis at each time point
# subset data frame for genotype: include only genes with FDR<0.05
df_long1 = df_long%>%
filter(!Factor%in%c("read-depth","data-source","cell-background","cell-type","cell-type-scaled"))
#mutate(Value = ifelse(df_long1$Value<(-4),NA,df_long1$Value))
df_long1$id = paste(df_long1$time,df_long1$Factor,sep="_")
write.csv(df_long1,file.path(out_dir,"genotype_significant_propvar.csv"),row.names = FALSE)# summarize the number of genotypes involved in the analysis for each day
ann = df_long1 %>%
filter(!is.infinite(Value))%>%
group_by(time, Factor) %>%
summarise(
median = median(Value, na.rm = TRUE)
)%>%
ungroup()%>%
group_by(time)%>%
summarise(num_genotype = sum(!is.na(median)))## `summarise()` has grouped output by 'time'. You can override using the
## `.groups` argument.
fig2=df_long1 %>%
filter(!is.infinite(Value))%>%
group_by(time, Factor) %>%
summarise(
median = median(Value,na.rm=T)
)%>%ggplot(aes(x=time,y=median,color = time))+
geom_violin()+
geom_boxplot(width = 0.1, aes(color = time)) + # Boxplot within violin, without outliers
geom_text(data =ann,
aes(x = time, y =-1, label = num_genotype),
size = 3, color = "black")+
ylab( "medians of \n log10(proportion \n variance explained)") +
xlab("")+
ggtitle("all available genotypes")+
scale_color_brewer(palette = "Set1")+
guides(color = "none")+
theme_bw()## `summarise()` has grouped output by 'time'. You can override using the
## `.groups` argument.
fig2## Warning: Removed 32 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 32 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
ggsave(
filename = file.path(out_dir,"genotype_median_summary.png"),
plot = fig2,
width = 6,
height = 3,
units = "in",
dpi = 300
)## Warning: Removed 32 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Removed 32 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
significant_prop <- df_long1 %>%
group_by(time, Factor) %>%
summarise(
total = n(),
value_count = sum((!is.infinite(Value))),
prop= value_count / total
)## `summarise()` has grouped output by 'time'. You can override using the
## `.groups` argument.
# prop = 1 only happens when the genotype is not included in the analysis (didn't pass filtering criteria)
significant_prop = significant_prop%>%filter(prop!=1)
significant_prop$id = paste(significant_prop$time,significant_prop$Factor,sep="_")Plot significant proportion variance explained by each genotype at each time point with the proportion of genes with significant proportion variance explained by the genotype
summary((df_long1%>%
filter(!is.infinite(Value)))$Value)## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -4.35 -1.80 -1.55 -1.60 -1.35 -0.28 119305
value_range <- range((df_long1%>%
filter(!is.infinite(Value)))$Value, na.rm = TRUE)
# Find the range of the proportion of non-NA values
prop_range <- c(0,0.3) #range(significant_prop$prop[significant_prop$prop!=1], na.rm = TRUE)
# Create the plot
fig3=df_long1%>%
filter(!is.infinite(Value))%>%
ggplot(aes(x = time, y = Value)) +
geom_violin(trim = T, alpha = 0.5) + # Violin plot
geom_boxplot(width = 0.1,outlier.size = 0.5) + # Boxplot within violin, without outliers
geom_point(data = significant_prop%>%filter(prop!=1),
aes(x = time, y = (prop - min(prop_range)) / diff(prop_range) * diff(value_range) + min(value_range)),
color = 'maroon',shape=8,size = 1, position = position_dodge(width = 0.9)) + # Points for proportion of non-NA values scaled to the primary y-axis
scale_y_continuous(
sec.axis = sec_axis(~ (. - min(value_range)) / diff(value_range) * diff(prop_range) + min(prop_range), name = "Proportion of genes with FDR<0.05")
) +
theme_bw() +
labs(x = "",
y = "log10(proportion variance explained)") +
facet_wrap(~Factor)
fig3## Warning: Removed 119305 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 119305 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
ggsave(
filename = file.path(out_dir,"genotype_timepoint_propVar.png"),
plot = fig3,
width = 10,
height = 10,
units = "in",
dpi = 300
)## Warning: Removed 119305 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Removed 119305 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
Missing point means that at that time point the genotype is not tested due to the lack of sufficient pseudobulk samples
fig4 = significant_prop%>%
filter(prop!=1)%>%
ggplot(aes(x=time,y=prop,group = Factor,color = Factor))+
geom_point()+
geom_line()+
ylab("proportion of genes with FDR<0.05")+
theme_bw()+
theme(legend.position = "none")+
facet_wrap(~Factor)
fig4## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
ggsave(
filename = file.path(out_dir,"genotype_proportionSignificant.png"),
plot = fig4,
width = 10,
height = 8,
units = "in",
dpi = 300
)## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
fig5 = significant_prop%>%
filter(prop!=1)%>%
ggplot(aes(x = time,y=prop,color = time))+
geom_violin(trim = T, alpha = 0.5) + # Violin plot
geom_boxplot(width = 0.1,outlier.size = 0.5,aes(color = time)) +
xlab("")+
ylab("median proportion of genes with FDR<0.05")+
guides(color="none")+
theme_bw()
fig5ggsave(
filename = file.path(out_dir,"celltype_prop_time.png"),
plot = fig5,
width = 8,
height = 4,
units = "in",
dpi = 300
)gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 8459573 451.8 14173694 757.0 14173694 757.0
## Vcells 3288123190 25086.4 6254144870 47715.4 6139780971 46842.9
sessionInfo()## R version 4.4.0 (2024-04-24)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 18.04.6 LTS
##
## Matrix products: default
## BLAS/LAPACK: FlexiBLAS OPENBLAS; LAPACK version 3.11.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: America/Los_Angeles
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] viridisLite_0.4.2 lubridate_1.9.3
## [3] forcats_1.0.0 purrr_1.0.2
## [5] readr_2.1.5 tidyverse_2.0.0
## [7] ggpubr_0.6.0 ggrepel_0.9.5
## [9] gridExtra_2.3 DESeq2_1.44.0
## [11] SummarizedExperiment_1.34.0 Biobase_2.64.0
## [13] MatrixGenerics_1.16.0 matrixStats_1.3.0
## [15] GenomicRanges_1.56.0 GenomeInfoDb_1.40.0
## [17] IRanges_2.38.0 S4Vectors_0.42.0
## [19] BiocGenerics_0.50.0 readxl_1.4.3
## [21] reshape2_1.4.4 pheatmap_1.0.12
## [23] stringr_1.5.1 patchwork_1.2.0
## [25] cowplot_1.1.3 tibble_3.2.1
## [27] ggplot2_3.5.1 tidyr_1.3.1
## [29] dplyr_1.1.4 Matrix_1.7-0
## [31] Seurat_5.1.0 SeuratObject_5.0.2
## [33] sp_2.1-4
##
## loaded via a namespace (and not attached):
## [1] RcppAnnoy_0.0.22 splines_4.4.0 later_1.3.2
## [4] cellranger_1.1.0 polyclip_1.10-6 fastDummies_1.7.3
## [7] lifecycle_1.0.4 rstatix_0.7.2 globals_0.16.3
## [10] lattice_0.22-6 MASS_7.3-60.2 backports_1.4.1
## [13] magrittr_2.0.3 plotly_4.10.4 sass_0.4.9
## [16] rmarkdown_2.26 jquerylib_0.1.4 yaml_2.3.8
## [19] httpuv_1.6.15 sctransform_0.4.1 spam_2.10-0
## [22] spatstat.sparse_3.0-3 reticulate_1.36.1 pbapply_1.7-2
## [25] RColorBrewer_1.1-3 abind_1.4-5 zlibbioc_1.50.0
## [28] Rtsne_0.17 GenomeInfoDbData_1.2.12 irlba_2.3.5.1
## [31] listenv_0.9.1 spatstat.utils_3.0-4 goftest_1.2-3
## [34] RSpectra_0.16-1 spatstat.random_3.2-3 fitdistrplus_1.2-1
## [37] parallelly_1.37.1 leiden_0.4.3.1 codetools_0.2-20
## [40] DelayedArray_0.30.1 tidyselect_1.2.1 UCSC.utils_1.0.0
## [43] farver_2.1.2 viridis_0.6.5 spatstat.explore_3.2-7
## [46] jsonlite_1.8.8 progressr_0.14.0 ggridges_0.5.6
## [49] survival_3.6-4 systemfonts_1.1.0 tools_4.4.0
## [52] ragg_1.3.1 ica_1.0-3 Rcpp_1.0.12
## [55] glue_1.7.0 SparseArray_1.4.3 xfun_0.44
## [58] withr_3.0.0 fastmap_1.2.0 fansi_1.0.6
## [61] digest_0.6.35 timechange_0.3.0 R6_2.5.1
## [64] mime_0.12 textshaping_0.3.7 colorspace_2.1-0
## [67] scattermore_1.2 tensor_1.5 spatstat.data_3.0-4
## [70] utf8_1.2.4 generics_0.1.3 data.table_1.15.4
## [73] httr_1.4.7 htmlwidgets_1.6.4 S4Arrays_1.4.0
## [76] uwot_0.2.2 pkgconfig_2.0.3 gtable_0.3.5
## [79] lmtest_0.9-40 XVector_0.44.0 htmltools_0.5.8.1
## [82] carData_3.0-5 dotCall64_1.1-1 scales_1.3.0
## [85] png_0.1-8 knitr_1.46 rstudioapi_0.16.0
## [88] tzdb_0.4.0 nlme_3.1-164 cachem_1.0.8
## [91] zoo_1.8-12 KernSmooth_2.23-22 parallel_4.4.0
## [94] miniUI_0.1.1.1 pillar_1.9.0 grid_4.4.0
## [97] vctrs_0.6.5 RANN_2.6.1 promises_1.3.0
## [100] car_3.1-2 xtable_1.8-4 cluster_2.1.6
## [103] evaluate_0.23 cli_3.6.2 locfit_1.5-9.9
## [106] compiler_4.4.0 rlang_1.1.3 crayon_1.5.2
## [109] future.apply_1.11.2 ggsignif_0.6.4 labeling_0.4.3
## [112] plyr_1.8.9 stringi_1.8.4 deldir_2.0-4
## [115] BiocParallel_1.38.0 munsell_0.5.1 lazyeval_0.2.2
## [118] spatstat.geom_3.2-9 RcppHNSW_0.6.0 hms_1.1.3
## [121] future_1.33.2 shiny_1.8.1.1 highr_0.10
## [124] ROCR_1.0-11 igraph_2.0.3 broom_1.0.5
## [127] bslib_0.7.0